In [ ]:
import pandas as pd
import numpy as np
import numerapi
import os
import plotly.express as px
import plotly.graph_objects as go
import catboost

#pca from sklearn
from sklearn.decomposition import PCA
In [ ]:
# Load the training and validation splits from local parquet files.
training_set = pd.read_parquet("data/numerai_training_data.parquet")
validation_set = pd.read_parquet("data/numerai_validation_data.parquet")

# A column counts as a feature when its name contains the substring "feature_".
feature_names = list(filter(lambda column: "feature_" in column, training_set.columns))
In [ ]:
# Sweep definition:
#   1. train a CatBoost model on all raw features (the None entry),
#   2. then train on several different numbers of PCA components,
#   3. compare era-wise correlation on the validation set.
# PCA sizes are the full feature count and its integer halvings down to 1/8.
n_features = len(feature_names)
N_pca_features = [None] + [n_features // divisor for divisor in (1, 2, 4, 8)]
N_pca_features
Out[ ]:
[None, 1050, 525, 262, 131]
In [ ]:
# Hyperparameters passed unchanged to every CatBoost model in the sweep.
params = dict(
    iterations=1000,
    learning_rate=0.01,
    depth=6,
    task_type="GPU",
    verbose=False,
)


def _era_pearson(era_frame):
    """Return the Pearson correlation of predictions vs. targets for one era group."""
    return np.corrcoef(era_frame["validation_prediction"], era_frame["target"])[0, 1]


# One mean correlation per entry of N_pca_features, appended in the same order.
mean_corrs = []
for n_components in N_pca_features:
    model = catboost.CatBoostRegressor(**params)

    if n_components is not None:
        # Fit the projection on the training features only, then apply it to both splits.
        pca = PCA(n_components=n_components).fit(training_set[feature_names])
        train_projected = pca.transform(training_set[feature_names])
        valid_projected = pca.transform(validation_set[feature_names])
        model.fit(train_projected, training_set["target"])
        validation_set["validation_prediction"] = model.predict(valid_projected)
    else:
        # Baseline run: train directly on the raw feature columns.
        model.fit(training_set[feature_names], training_set["target"])
        validation_set["validation_prediction"] = model.predict(validation_set[feature_names])

    # Score each era separately, then average the per-era correlations.
    per_era_corr = validation_set.groupby("era").apply(_era_pearson)
    score = per_era_corr.mean()

    print(f"{n_components} PCA components: {score}")
    mean_corrs.append(score)
None PCA components: 0.022954946780619122
1050 PCA components: 0.012031665005416482
525 PCA components: 0.01199597158943061
262 PCA components: 0.012240655156281562
131 PCA components: 0.01108041763419352
In [ ]:
# Plot the mean era-wise validation correlation for every configuration in the sweep.

# Build exactly one label per entry of N_pca_features so the labels stay aligned
# with mean_corrs. The None entry is the baseline trained on the raw features: it
# is relabelled rather than filtered out, because dropping it would leave one label
# fewer than there are scores and shift every remaining label by one position.
# The labels go into a new list instead of overwriting N_pca_features, so re-running
# this cell does not wrap already-converted strings a second time.
bar_labels = [
    f"No PCA ({len(feature_names)} features)" if item is None else f"{item} PCA Components"
    for item in N_pca_features
]
assert len(bar_labels) == len(mean_corrs), "expected one mean correlation per sweep entry"

# X axis: sweep configuration; Y axis: mean of the per-era correlations.
fig = px.bar(x=bar_labels, y=mean_corrs)
fig.update_layout(title_text="Mean Correlation of Validation Set Predictions",
                    xaxis_title="Number of PCA Components",
                    yaxis_title="Mean Correlation")
fig.show(renderer="notebook")